{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Otto商品分类——对测试数据进行特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Otto数据集是著名电商Otto提供的一个多类商品分类问题，类别数=9. 每个样本有93维数值型特征（整数，可能表示某种事件发生的次数，已经进行过脱敏处理）。 \n",
    "竞赛官网：https://www.kaggle.com/c/otto-group-product-classification-challenge/data\n",
    "\n",
    "对测试数据进行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 首先 import 必要的模块\n",
    "import pandas as pd \n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 94 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \\\n",
       "0   1       0       0       0       0       0       0       0       0       0   \n",
       "1   2       2       2      14      16       0       0       0       0       0   \n",
       "2   3       0       1      12       1       0       0       0       0       0   \n",
       "3   4       0       0       0       1       0       0       0       0       0   \n",
       "4   5       1       0       0       1       0       0       1       2       0   \n",
       "\n",
       "    ...     feat_84  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  \\\n",
       "0   ...           0        0       11        1       20        0        0   \n",
       "1   ...           0        0        0        0        0        4        0   \n",
       "2   ...           0        0        0        0        2        0        0   \n",
       "3   ...           0        3        1        0        0        0        0   \n",
       "4   ...           0        0        0        0        0        0        0   \n",
       "\n",
       "   feat_91  feat_92  feat_93  \n",
       "0        0        0        0  \n",
       "1        0        2        0  \n",
       "2        0        0        1  \n",
       "3        0        0        0  \n",
       "4        9        0        0  \n",
       "\n",
       "[5 rows x 94 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取数据\n",
    "# path to where the data lies\n",
    "dpath = './data/'\n",
    "test = pd.read_csv(dpath +\"Otto_test.csv\")\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 144368 entries, 0 to 144367\n",
      "Data columns (total 94 columns):\n",
      "id         144368 non-null int64\n",
      "feat_1     144368 non-null int64\n",
      "feat_2     144368 non-null int64\n",
      "feat_3     144368 non-null int64\n",
      "feat_4     144368 non-null int64\n",
      "feat_5     144368 non-null int64\n",
      "feat_6     144368 non-null int64\n",
      "feat_7     144368 non-null int64\n",
      "feat_8     144368 non-null int64\n",
      "feat_9     144368 non-null int64\n",
      "feat_10    144368 non-null int64\n",
      "feat_11    144368 non-null int64\n",
      "feat_12    144368 non-null int64\n",
      "feat_13    144368 non-null int64\n",
      "feat_14    144368 non-null int64\n",
      "feat_15    144368 non-null int64\n",
      "feat_16    144368 non-null int64\n",
      "feat_17    144368 non-null int64\n",
      "feat_18    144368 non-null int64\n",
      "feat_19    144368 non-null int64\n",
      "feat_20    144368 non-null int64\n",
      "feat_21    144368 non-null int64\n",
      "feat_22    144368 non-null int64\n",
      "feat_23    144368 non-null int64\n",
      "feat_24    144368 non-null int64\n",
      "feat_25    144368 non-null int64\n",
      "feat_26    144368 non-null int64\n",
      "feat_27    144368 non-null int64\n",
      "feat_28    144368 non-null int64\n",
      "feat_29    144368 non-null int64\n",
      "feat_30    144368 non-null int64\n",
      "feat_31    144368 non-null int64\n",
      "feat_32    144368 non-null int64\n",
      "feat_33    144368 non-null int64\n",
      "feat_34    144368 non-null int64\n",
      "feat_35    144368 non-null int64\n",
      "feat_36    144368 non-null int64\n",
      "feat_37    144368 non-null int64\n",
      "feat_38    144368 non-null int64\n",
      "feat_39    144368 non-null int64\n",
      "feat_40    144368 non-null int64\n",
      "feat_41    144368 non-null int64\n",
      "feat_42    144368 non-null int64\n",
      "feat_43    144368 non-null int64\n",
      "feat_44    144368 non-null int64\n",
      "feat_45    144368 non-null int64\n",
      "feat_46    144368 non-null int64\n",
      "feat_47    144368 non-null int64\n",
      "feat_48    144368 non-null int64\n",
      "feat_49    144368 non-null int64\n",
      "feat_50    144368 non-null int64\n",
      "feat_51    144368 non-null int64\n",
      "feat_52    144368 non-null int64\n",
      "feat_53    144368 non-null int64\n",
      "feat_54    144368 non-null int64\n",
      "feat_55    144368 non-null int64\n",
      "feat_56    144368 non-null int64\n",
      "feat_57    144368 non-null int64\n",
      "feat_58    144368 non-null int64\n",
      "feat_59    144368 non-null int64\n",
      "feat_60    144368 non-null int64\n",
      "feat_61    144368 non-null int64\n",
      "feat_62    144368 non-null int64\n",
      "feat_63    144368 non-null int64\n",
      "feat_64    144368 non-null int64\n",
      "feat_65    144368 non-null int64\n",
      "feat_66    144368 non-null int64\n",
      "feat_67    144368 non-null int64\n",
      "feat_68    144368 non-null int64\n",
      "feat_69    144368 non-null int64\n",
      "feat_70    144368 non-null int64\n",
      "feat_71    144368 non-null int64\n",
      "feat_72    144368 non-null int64\n",
      "feat_73    144368 non-null int64\n",
      "feat_74    144368 non-null int64\n",
      "feat_75    144368 non-null int64\n",
      "feat_76    144368 non-null int64\n",
      "feat_77    144368 non-null int64\n",
      "feat_78    144368 non-null int64\n",
      "feat_79    144368 non-null int64\n",
      "feat_80    144368 non-null int64\n",
      "feat_81    144368 non-null int64\n",
      "feat_82    144368 non-null int64\n",
      "feat_83    144368 non-null int64\n",
      "feat_84    144368 non-null int64\n",
      "feat_85    144368 non-null int64\n",
      "feat_86    144368 non-null int64\n",
      "feat_87    144368 non-null int64\n",
      "feat_88    144368 non-null int64\n",
      "feat_89    144368 non-null int64\n",
      "feat_90    144368 non-null int64\n",
      "feat_91    144368 non-null int64\n",
      "feat_92    144368 non-null int64\n",
      "feat_93    144368 non-null int64\n",
      "dtypes: int64(94)\n",
      "memory usage: 103.5 MB\n"
     ]
    }
   ],
   "source": [
    "test.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征工程\n",
    "测试数据的特征工程和训练数据一样\n",
    "特征编码的模型由训练集得到，此次将模型装载进来就好"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "特征变换，这个是体力活\n",
    "1. 取对数（对线性模型很重要，树模型影响不大）\n",
    "2. tf-idf\n",
    "3. 原始特征组合（加减乘除。如果是计数特征，乘法表示“and”，更有意义（FM）；或者可采用GBDT做特征编码，实现更高阶特征组合；原始特征维数太高，也可以先用基础模型得到特征的重要性，对重要的特征再组合）（CTR部分讲解）\n",
    "4. t-SNE及PCA降维后的特征 （降维部分讲解）\n",
    "5. 统计特征，如sum of the row, number of non-zero, max of the row，x-mean，个人感觉对这个数据集意义不大"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分开特征和id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#暂存id，用于保存特征变换后的结果并用于结果提交\n",
    "test_id = test['id']\n",
    "# drop ids and get labels\n",
    "X_test = test.drop([\"id\"], axis=1)\n",
    "\n",
    "#保存特征名字\n",
    "columns_org = X_test.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. feat编码：log(x+1)\n",
    "原始特征feat_x看起来像计数特征，取log运算更接近人对数字的敏感度，更适合线性模型。\n",
    "同时也可以降低长维分布中大数值的影响，减弱长维分布的长尾性。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feat_1_log</th>\n",
       "      <th>feat_2_log</th>\n",
       "      <th>feat_3_log</th>\n",
       "      <th>feat_4_log</th>\n",
       "      <th>feat_5_log</th>\n",
       "      <th>feat_6_log</th>\n",
       "      <th>feat_7_log</th>\n",
       "      <th>feat_8_log</th>\n",
       "      <th>feat_9_log</th>\n",
       "      <th>feat_10_log</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84_log</th>\n",
       "      <th>feat_85_log</th>\n",
       "      <th>feat_86_log</th>\n",
       "      <th>feat_87_log</th>\n",
       "      <th>feat_88_log</th>\n",
       "      <th>feat_89_log</th>\n",
       "      <th>feat_90_log</th>\n",
       "      <th>feat_91_log</th>\n",
       "      <th>feat_92_log</th>\n",
       "      <th>feat_93_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.386294</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.484907</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>3.044522</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.098612</td>\n",
       "      <td>1.098612</td>\n",
       "      <td>2.708050</td>\n",
       "      <td>2.833213</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.609438</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.098612</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>2.564949</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.098612</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.386294</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>1.098612</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.386294</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.302585</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 93 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   feat_1_log  feat_2_log  feat_3_log  feat_4_log  feat_5_log  feat_6_log  \\\n",
       "0    0.000000    0.000000    0.000000    0.000000         0.0         0.0   \n",
       "1    1.098612    1.098612    2.708050    2.833213         0.0         0.0   \n",
       "2    0.000000    0.693147    2.564949    0.693147         0.0         0.0   \n",
       "3    0.000000    0.000000    0.000000    0.693147         0.0         0.0   \n",
       "4    0.693147    0.000000    0.000000    0.693147         0.0         0.0   \n",
       "\n",
       "   feat_7_log  feat_8_log  feat_9_log  feat_10_log     ...       feat_84_log  \\\n",
       "0    0.000000    0.000000         0.0     1.386294     ...               0.0   \n",
       "1    0.000000    0.000000         0.0     0.000000     ...               0.0   \n",
       "2    0.000000    0.000000         0.0     0.000000     ...               0.0   \n",
       "3    0.000000    0.000000         0.0     0.000000     ...               0.0   \n",
       "4    0.693147    1.098612         0.0     1.386294     ...               0.0   \n",
       "\n",
       "   feat_85_log  feat_86_log  feat_87_log  feat_88_log  feat_89_log  \\\n",
       "0     0.000000     2.484907     0.693147     3.044522     0.000000   \n",
       "1     0.000000     0.000000     0.000000     0.000000     1.609438   \n",
       "2     0.000000     0.000000     0.000000     1.098612     0.000000   \n",
       "3     1.386294     0.693147     0.000000     0.000000     0.000000   \n",
       "4     0.000000     0.000000     0.000000     0.000000     0.000000   \n",
       "\n",
       "   feat_90_log  feat_91_log  feat_92_log  feat_93_log  \n",
       "0          0.0     0.000000     0.000000     0.000000  \n",
       "1          0.0     0.000000     1.098612     0.000000  \n",
       "2          0.0     0.000000     0.000000     0.693147  \n",
       "3          0.0     0.000000     0.000000     0.000000  \n",
       "4          0.0     2.302585     0.000000     0.000000  \n",
       "\n",
       "[5 rows x 93 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_log = np.log1p(X_test)\n",
    "\n",
    "#重新组成DataFrame\n",
    "feat_names = columns_org + \"_log\"\n",
    "X_test_log = pd.DataFrame(columns = feat_names, data = X_test_log.values)\n",
    "\n",
    "X_test_log.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. feat编码：TF-IDF\n",
    "原始特征feat_x看起来像计数特征，类似文本分析中词频特征的处理，TF-IDF可以突出对特别类别有贡献的低频词。\n",
    "这里原始特征已经是计数特征了，直接调用TfidfTransformer，将计数特征变成TF-IDF\n",
    "如果输入是原始文本，需要将计数功能（TF）和IDF功能集中在一起，用TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feat_1_tfidf</th>\n",
       "      <th>feat_2_tfidf</th>\n",
       "      <th>feat_3_tfidf</th>\n",
       "      <th>feat_4_tfidf</th>\n",
       "      <th>feat_5_tfidf</th>\n",
       "      <th>feat_6_tfidf</th>\n",
       "      <th>feat_7_tfidf</th>\n",
       "      <th>feat_8_tfidf</th>\n",
       "      <th>feat_9_tfidf</th>\n",
       "      <th>feat_10_tfidf</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84_tfidf</th>\n",
       "      <th>feat_85_tfidf</th>\n",
       "      <th>feat_86_tfidf</th>\n",
       "      <th>feat_87_tfidf</th>\n",
       "      <th>feat_88_tfidf</th>\n",
       "      <th>feat_89_tfidf</th>\n",
       "      <th>feat_90_tfidf</th>\n",
       "      <th>feat_91_tfidf</th>\n",
       "      <th>feat_92_tfidf</th>\n",
       "      <th>feat_93_tfidf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.183240</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.411895</td>\n",
       "      <td>0.052224</td>\n",
       "      <td>0.842245</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.067950</td>\n",
       "      <td>0.078094</td>\n",
       "      <td>0.443016</td>\n",
       "      <td>0.493584</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.122674</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.061405</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.058829</td>\n",
       "      <td>0.572102</td>\n",
       "      <td>0.046477</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078248</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.069951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.044693</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.135951</td>\n",
       "      <td>0.033452</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.047877</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.043471</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.059028</td>\n",
       "      <td>0.079725</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.159226</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.548938</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 93 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   feat_1_tfidf  feat_2_tfidf  feat_3_tfidf  feat_4_tfidf  feat_5_tfidf  \\\n",
       "0      0.000000      0.000000      0.000000      0.000000           0.0   \n",
       "1      0.067950      0.078094      0.443016      0.493584           0.0   \n",
       "2      0.000000      0.058829      0.572102      0.046477           0.0   \n",
       "3      0.000000      0.000000      0.000000      0.044693           0.0   \n",
       "4      0.047877      0.000000      0.000000      0.043471           0.0   \n",
       "\n",
       "   feat_6_tfidf  feat_7_tfidf  feat_8_tfidf  feat_9_tfidf  feat_10_tfidf  \\\n",
       "0           0.0      0.000000      0.000000           0.0       0.183240   \n",
       "1           0.0      0.000000      0.000000           0.0       0.000000   \n",
       "2           0.0      0.000000      0.000000           0.0       0.000000   \n",
       "3           0.0      0.000000      0.000000           0.0       0.000000   \n",
       "4           0.0      0.059028      0.079725           0.0       0.159226   \n",
       "\n",
       "       ...        feat_84_tfidf  feat_85_tfidf  feat_86_tfidf  feat_87_tfidf  \\\n",
       "0      ...                  0.0       0.000000       0.411895       0.052224   \n",
       "1      ...                  0.0       0.000000       0.000000       0.000000   \n",
       "2      ...                  0.0       0.000000       0.000000       0.000000   \n",
       "3      ...                  0.0       0.135951       0.033452       0.000000   \n",
       "4      ...                  0.0       0.000000       0.000000       0.000000   \n",
       "\n",
       "   feat_88_tfidf  feat_89_tfidf  feat_90_tfidf  feat_91_tfidf  feat_92_tfidf  \\\n",
       "0       0.842245       0.000000            0.0       0.000000       0.000000   \n",
       "1       0.000000       0.122674            0.0       0.000000       0.061405   \n",
       "2       0.078248       0.000000            0.0       0.000000       0.000000   \n",
       "3       0.000000       0.000000            0.0       0.000000       0.000000   \n",
       "4       0.000000       0.000000            0.0       0.548938       0.000000   \n",
       "\n",
       "   feat_93_tfidf  \n",
       "0       0.000000  \n",
       "1       0.000000  \n",
       "2       0.069951  \n",
       "3       0.000000  \n",
       "4       0.000000  \n",
       "\n",
       "[5 rows x 93 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# transform counts to TFIDF features\n",
    "#from sklearn.feature_extraction.text import TfidfTransformer\n",
    "#tfidf = TfidfTransformer()\n",
    "\n",
    "import cPickle\n",
    "tfidf = cPickle.load(open(\"tfidf.pkl\", 'rb'))\n",
    "\n",
    "#输出稀疏矩阵\n",
    "X_test_tfidf = tfidf.transform(X_test).toarray()\n",
    "\n",
    "#重新组成DataFrame,为了可视化\n",
    "feat_names = columns_org + \"_tfidf\"\n",
    "X_test_tfidf = pd.DataFrame(columns = feat_names, data = X_test_tfidf)\n",
    "\n",
    "X_test_tfidf.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 其他特征工程\n",
    "5. 一行的最大值、和、非0元素数目\n",
    "将这些特征加到原始特征中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#X_test['feat_max'] = X_test.max(axis=1)\n",
    "#X_test['feat_sum'] = X_test.sum(axis=1)\n",
    "#X_test['feat_zero_count'] = X_test.apply(lambda x : x.value_counts().get(0,0),axis=1)\n",
    "#X_test.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据预处理\n",
    "由于数据极度稀疏，数据缩放应采用MinMaxScaler，使得变换后的数据继续保持稀疏。\n",
    "如果将特征看似词频这种特征，不用缩放，而是用每个样本用模长归一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 对原始数据缩放\n",
    "#from sklearn.preprocessing import MinMaxScaler\n",
    "# 构造输入特征的标准化器\n",
    "#ms_org = MinMaxScaler()\n",
    "\n",
    "import cPickle\n",
    "ms_org = cPickle.load(open(\"MinMaxSclaer_org.pkl\", 'rb'))\n",
    "\n",
    "#保存特征名字，用于结果保存为csv\n",
    "feat_names_org = X_test.columns\n",
    "\n",
    "# 用训练模型训练好的缩放器对测试数据进行特征缩放：transform\n",
    "X_test = ms_org.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 对log数据缩放\n",
    "#from sklearn.preprocessing import MinMaxScaler\n",
    "# 构造输入特征的标准化器\n",
    "#ms_log = MinMaxScaler()\n",
    "\n",
    "import cPickle\n",
    "ms_log = cPickle.load(open(\"MinMaxSclaer_log.pkl\", 'rb'))\n",
    "\n",
    "#保存特征名字，用于结果保存为csv\n",
    "feat_names_log = X_test_log.columns\n",
    "\n",
    "# 用训练模型训练好的缩放器对测试数据进行特征缩放：transform\n",
    "X_test_log = ms_log.transform(X_test_log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 对tf-idf数据缩放\n",
    "#from sklearn.preprocessing import MinMaxScaler\n",
    "# 构造输入特征的标准化器\n",
    "#ms_tfidf = MinMaxScaler()\n",
    "\n",
    "import cPickle\n",
    "ms_tfidf = cPickle.load(open(\"MinMaxSclaer_tfidf.pkl\", 'rb'))\n",
    "\n",
    "\n",
    "#保存特征名字，用于结果保存为csv\n",
    "feat_names_tfidf = X_test_tfidf.columns\n",
    "\n",
    "# 用训练模型训练好的缩放器对测试数据进行特征缩放：transform\n",
    "X_test_tfidf = ms_tfidf.transform(X_test_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#保存原始特征\n",
    "feat_names = columns_org\n",
    "test_org = pd.concat([test_id, pd.DataFrame(columns = feat_names_org, data = X_test)], axis = 1)\n",
    "test_org.to_csv('./data/'+'Otto_FE_test_org.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#保存log特征变换结果\n",
    "test_log = pd.concat([test_id, pd.DataFrame(columns = feat_names_log, data = X_test_log)], axis = 1)\n",
    "test_log.to_csv('./data/'+'Otto_FE_test_log.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#保存tf-idf特征变换结果\n",
    "test_tfidf = pd.concat([test_id, pd.DataFrame(columns = feat_names_tfidf, data = X_test_tfidf)], axis = 1)\n",
    "test_tfidf.to_csv('./data/'+'Otto_FE_test_tfidf.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
